This is global analysis of all of the muations for all patients that we have RNA-seq for.
Only variant with at least 3% AF and 100x coverage at the given location (must be in all samples from all patient) will be kept. I will also remove patients whose bulk RNA-seq clusters in a weird way (e.g. GC RNA-seq does not cluster with other GC samples).
For any position that is not fulfiling the above criteria the position is dropped.
Finally, I remove troublesome MT position: 295, 2617, 13710 - reported as mutated in RNA-seq 3107 - This position is next to an deletion and RNA-seq and WGS do not work well together. 302 - 317 - This is the variable region of d-loop. When I manually check it there are lot of mutations in a stretch (C)7T(C)5 and they affect alignment of T nucleotide in a stretch https://www.nature.com/articles/cr200969
The following code can be run in bash to collect the frequencies of nucleotides. Requires samtools (http://www.htslib.org/) and pysamstats (https://github.com/alimanfoo/pysamstats). Here I used samtools 1.9 and pysamstats 1.1.2 (pysam 0.15.2)
for i in 0062 0231 0412 0553 0832 0848 0931 1075 1218 1281 1533 1546 1566 1661 1790;
do
for ii in BE GC NE D2 ;
do
echo $i
echo $ii
# samtools index ../AHM_$i"_"$ii"_Aligned_MT.bam"
############ RNA-seq data only have 3 qualities or read aligment based on the number of regions the read was mapped to. Keep only reads that map to a single location (-min-mapq=200)
############ The quality of based was variable and I only keep the best nucleotides with base quality above 34
pysamstats --type variation --min-mapq=200 --min-baseq=34 -f ../../Files/hg37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa ../../BAM/RNA-seq/AHM_$i"_"$ii"_Aligned_MT.bam" >./AHM_$i"_"$ii"_Aligned_MT_q.counts"
done
done
library("edgeR")
library("ggplot2")
library("pheatmap")
library("tidyr")
library("reshape2")
library("RColorBrewer")
library("corrplot")
# function for calculation of mitodistance between samples
mitodist <- function(x, y, z = 100, vaf = 0.03, count = 0, positions = NA) {
# z - coverage vaf - minimum VAF count - combined count in two samples for given
# VAF
# merge locations as they do not always overlap
tmp <- merge(x, y, by = c("chrom", "pos"), all = TRUE, sort = TRUE)
tmp <- tmp[!(tmp$pos %in% positions), ]
# replace the NA with 0 at the counts table
tmp[, c(4, 7:10, 12, 15:18)][is.na(tmp[, c(4, 7:10, 12, 15:18)])] <- 0
# make matrix of frequencies
x2 <- as.matrix(tmp[, 7:10]/rowSums(as.matrix(tmp[, 7:10])))
x2[is.na(x2)] <- 0
y2 <- as.matrix(tmp[, 15:18]/rowSums(as.matrix(tmp[, 15:18])))
y2[is.na(y2)] <- 0
# change counts to 0 if they are below indicated vaf in both samples (1 sample
# above given vaf is required to keep the allel)
tmp[, 7:10][x2 < vaf & y2 < vaf] <- 0
tmp[, 15:18][x2 < vaf & y2 < vaf] <- 0
# change counts to 0 if they are below required coverage for the given alelle
# (default is minimum of 5 reads per pair/sum of samples)
tmp[, 7:10][(tmp[, 7:10] + tmp[, 15:18]) < count] <- 0
tmp[, 15:18][(tmp[, 7:10] + tmp[, 15:18]) < count] <- 0
# recalculate the frequencies
x2 <- as.matrix(tmp[, 7:10]/rowSums(as.matrix(tmp[, 7:10])))
x2[is.na(x2)] <- 0
y2 <- as.matrix(tmp[, 15:18]/rowSums(as.matrix(tmp[, 15:18])))
y2[is.na(y2)] <- 0
# get the indicator tables. This table check the coverage at each position. It
# has to be larger than indicated coverage (default is 100)
x1 <- as.matrix(ifelse(rowSums(as.matrix(tmp[, 7:10])) > z, 1, 0))
y1 <- as.matrix(ifelse(rowSums(as.matrix(tmp[, 15:18])) > z, 1, 0))
# calculate square root of absolute AF difference
AF_dist <- sqrt(abs(x2 - y2))
# calculate distance
dist <- (sum(AF_dist * as.vector(x1 * y1)))/sum(x1 * y1)
return(dist)
}
First, I will check the phenotypes of the samples to confirm that they are what they should be. In another analysis I noticed that some samples seem to cluster with wrong tissue, i.e. NE clusters with gastric tissue.
# Names of samples:
samples <- c("0062", "0231", "0412", "0553", "0832", "0848", "0931", "1075", "1218",
"1281", "1533", "1566", "1661", "1790") #RNA-seq data
conditions <- c("BE", "GC", "D2", "NE") # RNA-seq data
files.dir <- "~/Dropbox/Postdoc/2019-12-29_BE2020/RNA-seq/"
# EAC and normal data (tmp, linear scale)
bulk.data <- readRDS(paste0(files.dir, "/EAC-all/count_codingGenes.rds"))
IDs <- grep(paste(samples, collapse = "|"), colnames(bulk.data), value = TRUE)
bulk.data <- bulk.data[, IDs]
# get information about the samples from EAC
metadata <- readRDS(paste0(files.dir, "//EAC-all/samplesAnnotation.rds"))
metadata <- metadata[IDs, 6:8]
group <- factor(metadata$sampleGroup) #Levels: Barretts Duodenum Gastric Squamous
y <- DGEList(bulk.data, group = group)
# #keep only genes with expression above 1 cpm in at least 3 samples
keep <- rowSums(cpm(y) > 1) >= 3
y <- y[keep, , keep.lib.sizes = FALSE]
# get normalization factors
y <- calcNormFactors(y)
# Start performing diff expression - get sample desing model design <-
# model.matrix(~ batch + group)
design <- model.matrix(~0 + group)
rownames(design) <- colnames(y)
colnames(design) <- levels(group)
# Estimate disparsion
y <- estimateDisp(y, design, robust = TRUE)
plotBCV(y)
# Fit general linear model data
fit <- glmQLFit(y, design, robust = TRUE)
plotQLDisp(fit)
points <- rep(c(0, 1, 2), 2)
colors <- rep(c("blue", "darkgreen", "red"), 2)
p <- plotMDS(y, col = colors[group], pch = points[group], plot = FALSE)
p2 <- data.frame(p$x, p$y, .Tissue = metadata$sampleGroup, .Batch = metadata$batch)
ggplot(data = p2) + geom_point(mapping = aes(x = p.x, y = p.y, colour = .Tissue),
show.legend = TRUE) + theme_bw() + labs(x = paste0(p$axislabel, " 1"), y = paste0(p$axislabel,
" 2")) + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("MDS analysis")
ggplot(data = p2) + geom_point(mapping = aes(x = p.x, y = p.y, colour = .Batch),
show.legend = TRUE) + theme_bw() + labs(x = paste0(p$axislabel, " 1"), y = paste0(p$axislabel,
" 2")) + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("MDS analysis")
geneMatrix <- cpm(y, prior.count = 1, log = TRUE, normalized.lib.sizes = TRUE)
countVar <- apply(geneMatrix, 1, var)
highVar <- order(countVar, decreasing = TRUE)[1:500]
hmDat <- geneMatrix[highVar, ]
pca <- prcomp(t(hmDat), center = TRUE, scale. = TRUE)
loadings <- data.frame(pca$x, .Tissue = metadata$sampleGroup, .Batch = metadata$batch)
ggplot(data = loadings) + geom_point(mapping = aes(x = PC1, y = PC2, colour = .Tissue),
show.legend = TRUE) + theme_bw() + labs(x = paste0("PC1, ", round(summary(pca)$importance[2,
1] * 100, 2), "% variance"), y = paste0("PC2, ", round(summary(pca)$importance[2,
2] * 100, 2), "% variance")) + theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA - 500 HVGs")
ggplot(data = loadings) + geom_point(mapping = aes(x = PC1, y = PC2, colour = .Batch),
show.legend = TRUE) + theme_bw() + labs(x = paste0("PC1, ", round(summary(pca)$importance[2,
1] * 100, 2), "% variance"), y = paste0("PC2, ", round(summary(pca)$importance[2,
2] * 100, 2), "% variance")) + theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA - 500 HVGs")
tmp <- pheatmap(hmDat, clustering_method = "ward.D2", main = "500 HVGs, all data",
show_rownames = FALSE, annotation_col = metadata[, c(1, 3), drop = FALSE], cutree_cols = 4)
dodgy.samples <- grep(paste(c("0412", "1218"), collapse = "|"), colnames(bulk.data),
value = TRUE)
metadata$is.dodgy <- ifelse(rownames(metadata) %in% dodgy.samples, "yes", "no")
pheatmap(hmDat, clustering_method = "ward.D2", main = "500 HVGs, all data", show_rownames = FALSE,
annotation_col = metadata[, c(1, 3, 4), drop = FALSE], cutree_cols = 4)
Samples originating from patients AHM0412 and AHM1218 are not very good. I will remove them from subsequent analysis.
# Names of samples:
samples <- c("0062", "0231", "0412", "0553", "0832", "0848", "0931", "1075", "1218",
"1281", "1533", "1566", "1661", "1790") #RNA-seq data
samples <- samples[!(samples %in% c("0412", "1218"))] #RNA-seq data
conditions <- c("BE", "GC", "D2", "NE") # RNA-seq data
files.dir <- "~/Dropbox/Postdoc/2019-12-29_BE2020/RNA-seq/"
# EAC and normal data (tmp, linear scale)
bulk.data <- readRDS(paste0(files.dir, "/EAC-all/count_codingGenes.rds"))
IDs <- grep(paste(samples, collapse = "|"), colnames(bulk.data), value = TRUE)
bulk.data <- bulk.data[, IDs]
# get information about the samples from EAC
metadata <- readRDS(paste0(files.dir, "//EAC-all/samplesAnnotation.rds"))
metadata <- metadata[IDs, 6:8]
group <- factor(metadata$sampleGroup) #Levels: Barretts Duodenum Gastric Squamous
y <- DGEList(bulk.data, group = group)
# #keep only genes with expression above 1 cpm in at least 3 samples
keep <- rowSums(cpm(y) > 1) >= 3
y <- y[keep, , keep.lib.sizes = FALSE]
# get normalization factors
y <- calcNormFactors(y)
# Start performing diff expression - get sample desing model design <-
# model.matrix(~ batch + group)
design <- model.matrix(~0 + group)
rownames(design) <- colnames(y)
colnames(design) <- levels(group)
# Estimate disparsion
y <- estimateDisp(y, design, robust = TRUE)
plotBCV(y)
# Fit general linear model data
fit <- glmQLFit(y, design, robust = TRUE)
plotQLDisp(fit)
points <- rep(c(0, 1, 2), 2)
colors <- rep(c("blue", "darkgreen", "red"), 2)
p <- plotMDS(y, col = colors[group], pch = points[group], plot = FALSE)
p2 <- data.frame(p$x, p$y, .names = metadata$sampleGroup, .names2 = metadata$batch)
ggplot(data = p2) + geom_point(mapping = aes(x = p.x, y = p.y, colour = .names),
show.legend = TRUE) + theme_bw() + labs(x = paste0(p$axislabel, " 1"), y = paste0(p$axislabel,
" 2")) + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("MDS analysis")
ggplot(data = p2) + geom_point(mapping = aes(x = p.x, y = p.y, colour = .names2),
show.legend = TRUE) + theme_bw() + labs(x = paste0(p$axislabel, " 1"), y = paste0(p$axislabel,
" 2")) + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("MDS analysis")
geneMatrix <- cpm(y, prior.count = 1, log = TRUE, normalized.lib.sizes = TRUE)
countVar <- apply(geneMatrix, 1, var)
highVar <- order(countVar, decreasing = TRUE)[1:500]
hmDat <- geneMatrix[highVar, ]
pca <- prcomp(t(hmDat), center = TRUE, scale. = TRUE)
loadings <- data.frame(pca$x, .names = metadata$sampleGroup, .names2 = metadata$batch)
ggplot(data = loadings) + geom_point(mapping = aes(x = PC1, y = PC2, colour = .names),
show.legend = TRUE) + theme_bw() + labs(x = paste0("PC1, ", round(summary(pca)$importance[2,
1] * 100, 2), "% variance"), y = paste0("PC2, ", round(summary(pca)$importance[2,
2] * 100, 2), "% variance")) + theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA - 500 HVGs")
ggplot(data = loadings) + geom_point(mapping = aes(x = PC1, y = PC2, colour = .names2),
show.legend = TRUE) + theme_bw() + labs(x = paste0("PC1, ", round(summary(pca)$importance[2,
1] * 100, 2), "% variance"), y = paste0("PC2, ", round(summary(pca)$importance[2,
2] * 100, 2), "% variance")) + theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA - 500 HVGs")
tmp <- pheatmap(hmDat, clustering_method = "ward.D2", main = "500 HVGs, all data",
show_rownames = FALSE, annotation_col = metadata[, c(1, 3), drop = FALSE], cutree_cols = 4)
dodgy.samples <- grep(paste(c("0412", "1218"), collapse = "|"), colnames(bulk.data),
value = TRUE)
metadata$is.dodgy <- ifelse(rownames(metadata) %in% dodgy.samples, "yes", "no")
pheatmap(hmDat, clustering_method = "ward.D2", main = "500 HVGs, all data", show_rownames = FALSE,
annotation_col = metadata[, c(1, 3, 4), drop = FALSE], cutree_cols = 4)
pheatmap(hmDat, clustering_method = "complete", main = "500 HVGs, all data, complete clustering",
show_rownames = FALSE, annotation_col = metadata[, c(1, 3, 4), drop = FALSE],
cutree_cols = 4)
I increased the treshold for good quality nucleotides. For some reason, the quality of nucletides coming from our analysis if not continous. I only keep nucleotides with phred>34. See the run.txt script in data.path/Counts2 to see how it was calculated.
# Names of samples:
samples <- c("0062", "0231", "0412", "0553", "0832", "0848", "0931", "1075", "1218",
"1281", "1533", "1566", "1661", "1790") #RNA-seq data
samples <- samples[!(samples %in% c("0412", "1218"))] #RNA-seq data
# conditions<-c('BE','GC','D2','NE') # RNA-seq data
conditions <- c("BE", "GC", "NE") # RNA-seq data
# location of the figures
files.dir <- "~/Dropbox/Postdoc/2019-12-29_BE2020/RNA-seq/"
condition.map <- c(NE = "NE", BE = "BE", GC = "NG")
# Set Conditions
z = 100
vaf = 0.03
# data location
data.path <- "~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/"
# data.path2<-'/home/karolno/Dropbox/Postdoc/2019-06-30_Data/Figure_3E'
# data.path<-'/home/karolno/Dropbox/Postdoc/2019-04-15_Mitotracing/'
# positions to be removed
positions <- c(302:317, 295, 2617, 3107, 13710)
# create list for all the data
all.data <- list()
# Read data all RNA-seq samples all conditions for RNA-seq
for (sample in samples) {
for (condition in conditions) {
all.data[[paste0(sample, "_", condition.map[condition], "_RNA")]] <- read.delim(file = paste0(data.path,
"/Figure_3E/Counts2/RNA-seq/AHM_", sample, "_", condition, "_Aligned_MT_q.counts"),
stringsAsFactors = FALSE)[, c(1:4, 6, 8, 14, 16, 18, 20)] #keeps information about the count of mutatations
}
}
# create an empty data frame to store all of the data
all.data2 <- data.frame(chrom = "MT", pos = 1:17000, ref = "N") #arbitrary size of the chromosome. count 0 is eventually removed
all.data2$ref <- as.character(all.data2$ref)
all.data2 <- all.data2[!(all.data2$pos %in% positions), ]
# create an empty data frame to store coverage data
all.cov <- data.frame(chrom = "MT", pos = 1:17000, ref = "N") #will contain coverage data
all.cov <- all.cov[!(all.cov$pos %in% positions), ]
all.AF <- matrix() # will contain AF data
all.cov.test <- matrix() #will contain coverage test
all.AF.test <- matrix() # will contain AF test
# populated the data for each sample
for (name in names(all.data)) {
tmp <- all.data[[name]] #copy data to dataframe from list of dataframes
colnames(tmp)[c(4, 7:10)] <- paste0(c("reads", colnames(tmp)[c(7:10)]), "_",
name) #change column names
# all.data2$ref[as.numeric(tmp$pos)]<-tmp$ref #insert a nucleotide for the
# position probably not needed
all.data2 <- merge(all.data2, tmp[, c(1:2, 7:10)], by = 1:2, all.x = TRUE)
all.cov <- merge(all.cov, tmp[, c(1, 2, 4, 4, 4, 4)], by = 1:2, all.x = TRUE)
}
# get allele fractions
all.AF <- all.data2[, 4:ncol(all.data2)]/all.cov[, 4:ncol(all.cov)]
# get the dominant allele per sample
all.AF0.1 <- all.AF
colnames(all.AF0.1) <- rep(c("A", "C", "T", "G"), times = ncol(all.AF0.1)/4)
all.AF0.2 <- matrix(ncol = 4, nrow = nrow(all.AF0.1))
colnames(all.AF0.2) <- c("A", "C", "T", "G")
all.AF0.2[, 1] <- apply(all.AF0.1[, colnames(all.AF0.1) == "A"], 1, median, na.rm = TRUE)
all.AF0.2[, 2] <- apply(all.AF0.1[, colnames(all.AF0.1) == "C"], 1, median, na.rm = TRUE)
all.AF0.2[, 3] <- apply(all.AF0.1[, colnames(all.AF0.1) == "T"], 1, median, na.rm = TRUE)
all.AF0.2[, 4] <- apply(all.AF0.1[, colnames(all.AF0.1) == "G"], 1, median, na.rm = TRUE)
all.AF0.2[is.na(all.AF0.2)] <- 0
all.data2$ref <- unlist(apply(all.AF0.2, 1, function(x) ifelse(max(x) > 0, names(x)[x ==
max(x)], "N")))
# test for the AF
all.AF.test <- all.AF >= vaf
all.AF.test[is.na(all.AF.test)] <- FALSE
# test for coverage
all.cov.test <- all.cov[, 4:ncol(all.cov)] >= z
all.cov.test[is.na(all.cov.test)] <- FALSE
all.cov.test2 <- apply(all.cov.test, 1, all)
# get combined data
# This is to get the combined test
combined <- all.AF.test & all.cov.test
combined1 <- apply(combined, 1, any)
combined2 <- combined1 & all.cov.test2
# selection only mutations that matter
all.data2.1 <- all.data2[combined2, ]
all.AF2 <- all.AF[combined2, ]
shared2 <- all.data2.1
shared2[, 4:ncol(shared2)] <- all.AF2
# melt the tables into a pivot table. I use it to get the information about
# individual alleles per position
shared3 <- melt(shared2, id.vars = 1:3, variable.name = "ID", value.name = "AF")
# create sample info columns
shared4 <- separate(shared3, col = 4, sep = "_", remove = FALSE, into = c("allele",
"patient", "tissue", "Data"))
# create tracking column
shared4$tracking <- paste0(shared4$patient, "_", shared4$tissue, "_", shared4$Data)
# remove total count from data
shared5 <- shared4[shared4$allele != "reads", ]
# get only mutations (I don't use AF for the dominant allele at each position)
shared5 <- shared4[shared4$ref != shared4$allele, ]
# restruct the data into a table wiht with proper shape for plotting
shared6 <- dcast(shared5, chrom + pos + ref + allele ~ tracking, value.var = "AF")
# choose positons wth appropriate vaf
shared6.test <- shared6[, 5:ncol(shared6)] >= vaf
shared6.test[is.na(shared6.test)] <- FALSE
shared6.test2 <- apply(shared6.test, 1, any)
shared8 <- shared6[shared6.test2, ]
shared8[is.na(shared8)] <- 0
# create a table of results
results <- matrix(nrow = length(all.data), ncol = length(all.data))
# rownames(results)<-sort(names(all.data))
# colnames(results)<-sort(names(all.data))
rownames(results) <- names(all.data)
colnames(results) <- names(all.data)
# calculate pearson correlation between the samples
for (n in rownames(results)) {
for (nn in colnames(results)) {
results[n, nn] <- cor(sqrt(shared8[, colnames(shared8) == n]), sqrt(shared8[,
colnames(shared8) == nn]), method = "pearson")
}
}
# Get ID for heatmaps
ID <- (as.data.frame(rownames(results)) %>% separate(1, c("Patient", "Tissue", "Data"),
"_"))[, 1:2]
rownames(ID) <- rownames(results)
ID$Patient <- paste0("AHM", ID$Patient)
rowID <- shared8[, 2:4]
# add type of mutations data
rowID$mut <- factor(paste0(rowID$ref, ">", rowID$allele), levels = c("A>C", "A>G",
"A>T", "G>A", "G>C", "G>T", "T>G", "T>C", "T>A", "C>T", "C>G", "C>A"))
levels(rowID$mut)[levels(rowID$mut) %in% c("A>C", "A>G", "A>T", "G>A", "G>C", "G>T")] <- c("T>G",
"T>C", "T>A", "C>T", "C>G", "C>A")
# print results
barplot(table(rowID$mut), main = "Frequency of mutations in all samples")
pheatmap(sqrt(shared8[, 5:ncol(shared8)]), cluster_cols = FALSE, cluster_rows = FALSE,
main = "Variable mutations sqrt(AF)", annotation_row = rowID, annotation_col = ID,
labels_row = shared8$pos, labels_col = paste0(ID$Patient, " ", ID$Tissue), clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation", clustering_method = "complete")
pheatmap(sqrt(shared8[, 5:ncol(shared8)]), cluster_cols = FALSE, cluster_rows = TRUE,
main = "Variable mutations sqrt(AF)", annotation_row = rowID, annotation_col = ID,
labels_row = shared8$pos, labels_col = paste0(ID$Patient, " ", ID$Tissue), clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation", clustering_method = "complete")
pheatmap(sqrt(shared8[, 5:ncol(shared8)]), cluster_cols = TRUE, cluster_rows = TRUE,
main = "Variable mutations sqrt(AF)", annotation_row = rowID, annotation_col = ID,
labels_row = shared8$pos, labels_col = paste0(ID$Patient, " ", ID$Tissue), clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation", clustering_method = "complete")
results2 <- results
breaksList5 = seq(floor(min(c(results2, 0)) * 20)/20, 1, by = 0.05)
breaksList5 = seq(-1, 1, by = 0.05)
pheatmap(results2, annotation_col = ID, annotation_row = ID, na_col = "black", scale = "none",
cluster_cols = FALSE, cluster_rows = FALSE, clustering_method = "ward.D2", main = "All data no clustering",
breaks = breaksList5, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksList5)),
labels_row = paste0(ID$Patient, " ", ID$Tissue), labels_col = paste0(ID$Patient,
" ", ID$Tissue))
sample.names <- paste0(c("BE", "NG", "NE"), " ", rep("Patient B", ncol(results2)),
sprintf("%02d", ceiling((1:ncol(results2))/3)))
# sample.names<-sapply(sapply(rownames(results2), strsplit, '_'), function (x)
# paste0('AHM',x[1],' ',x[2]))
results2.1 <- results2
rownames(results2.1) <- sample.names
colnames(results2.1) <- sample.names
corrplot(results2.1, method = "circle", na.label = "NA", addgrid.col = NA, cl.length = 11,
tl.col = "black", tl.cex = 0.75, type = "upper", tl.pos = "tl", col = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100), order = "original", tl.srt = 90)
pheatmap(results2, annotation_col = ID, annotation_row = ID, na_col = "black", scale = "none",
cluster_cols = TRUE, cluster_rows = TRUE, clustering_distance_rows = as.dist((1 -
(results2))/2), clustering_distance_cols = as.dist((1 - (results2))/2), clustering_method = "complete",
main = "All data with clustering - complete", breaks = breaksList5, color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(length(breaksList5)), labels_row = paste0(ID$Patient, " ",
ID$Tissue), labels_col = paste0(ID$Patient, " ", ID$Tissue), width = 22,
height = 20)
pheatmap(results2, annotation_col = ID, annotation_row = ID, na_col = "black", scale = "none",
cluster_cols = TRUE, cluster_rows = TRUE, clustering_distance_rows = as.dist((1 -
(results2))/2), clustering_distance_cols = as.dist((1 - (results2))/2), clustering_method = "ward.D2",
main = "All data with clustering ward.D2", breaks = breaksList5, color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(length(breaksList5)), labels_row = paste0(ID$Patient, " ",
ID$Tissue), labels_col = paste0(ID$Patient, " ", ID$Tissue), width = 22,
height = 20)
# produce a table of results
results.mito <- matrix(nrow = length(all.data), ncol = length(all.data))
# add names
rownames(results.mito) <- names(all.data)
colnames(results.mito) <- names(all.data)
# z=200 vaf=0.01 count=5
z = 100
reads = 0
vaf = 0.03
# positions<-c(295, 2617, 3107, 13710)
# calculated mitochondrial distance between all samples
for (n in 1:length(rownames(results.mito))) {
# print(n)
for (nn in n:length(colnames(results.mito))) {
tmp <- mitodist(all.data[[rownames(results.mito)[n]]], all.data[[colnames(results.mito)[nn]]],
z = z, count = reads, vaf = vaf, positions = positions)
results.mito[n, nn] <- tmp
results.mito[nn, n] <- tmp
}
}
# create IDs
ID <- (as.data.frame(rownames(results.mito)) %>% separate(1, c("Patient", "Tissue",
"Data"), "_"))
rownames(ID) <- rownames(results.mito)
# change the distance to bigger values (factor 100)
results.mito.1 <- results.mito * 100
sample.names <- sapply(sapply(rownames(results.mito.1), strsplit, "_"), function(x) paste0("AHM",
x[1], " ", x[2]))
# plot heatmap of mitochondrial distance
pheatmap(results.mito, annotation_col = ID, annotation_row = ID, na_col = "black",
scale = "none", cluster_cols = TRUE, cluster_rows = TRUE, clustering_distance_rows = as.dist(results.mito),
clustering_distance_cols = as.dist(results.mito), clustering_method = "complete",
main = "All data with clustering - complete", labels_row = sample.names, labels_col = sample.names)
pheatmap(results.mito.1, annotation_col = ID, annotation_row = ID, na_col = "black",
scale = "none", cluster_cols = TRUE, cluster_rows = TRUE, clustering_distance_rows = as.dist(results.mito.1),
clustering_distance_cols = as.dist(results.mito.1), clustering_method = "complete",
main = "All data scaled with clustering - complete", labels_row = sample.names,
labels_col = sample.names)
# change names of the samples
rownames(results.mito.1) <- sample.names
colnames(results.mito.1) <- sample.names
corrplot(results.mito.1, method = "circle", type = "lower", is.corr = FALSE, col = colorRampPalette(brewer.pal(n = 3,
name = "Reds"))(40), cl.lim = c(0, 1), addgrid.col = NA, cl.length = 11, tl.col = "black",
tl.cex = 0.75, tl.srt = 45)
corrplot(results.mito.1, method = "circle", type = "upper", is.corr = FALSE, col = colorRampPalette(brewer.pal(n = 3,
name = "Reds"))(40), cl.lim = c(0, 1), addgrid.col = NA, cl.length = 11, tl.col = "black",
tl.cex = 0.75, tl.srt = 45)
# create IDs
ID <- (as.data.frame(rownames(results.mito.1)) %>% separate(1, c("Patient", "Tissue"),
" "))
rownames(ID) <- rownames(results.mito.1)
# ID$Patient <- paste0('AHM',ID$Patient) image annotation
anno_col <- list(Patient = brewer.pal(12, "Set3"), Tissue = c(NG = "#4DBBD5FF", NE = "darkred",
BE = "#00A087FF"))
names(anno_col$Patient) <- unique(ID$Patient)
dev.off()
## null device
## 1
pdf(paste0(data.path, "/Figure_3E.pdf"), useDingbats = FALSE, width = 11, height = 11)
pheatmap(results.mito.1, annotation_col = ID, annotation_row = ID, na_col = "black",
scale = "none", cluster_cols = FALSE, cluster_rows = FALSE, clustering_distance_rows = as.dist(results.mito.1),
clustering_distance_cols = as.dist(results.mito.1), clustering_method = "complete",
main = "All data scaled with clustering - complete", labels_row = sample.names,
labels_col = sample.names, annotation_colors = anno_col)
corrplot(results2.1, method = "circle", na.label = "NA", addgrid.col = NA, cl.length = 11,
tl.col = "black", tl.cex = 0.75, type = "upper", tl.pos = "tl", col = colorRampPalette(rev(brewer.pal(n = 9,
name = "RdBu")))(100), order = "original", tl.srt = 90)
corrplot(results2.1, method = "circle", na.label = "NA", addgrid.col = NA, cl.length = 11,
tl.col = "black", tl.cex = 0.75, type = "upper", tl.pos = "td", col = colorRampPalette(rev(brewer.pal(n = 9,
name = "RdBu")))(100), order = "original", tl.srt = 45)
corrplot(results.mito.1, method = "circle", type = "lower", is.corr = FALSE, col = colorRampPalette(c("white",
"white", "blue"))(100)[10:100], cl.lim = c(0, 0.8), addgrid.col = "#BEBEBE40",
cl.length = 9, tl.col = "black", tl.cex = 0.75, tl.srt = 45)
corrplot((1 - (results.mito.1)), method = "circle", type = "lower", is.corr = FALSE,
col = colorRampPalette(rev(brewer.pal(n = 9, name = "RdBu")))(100)[1:100], cl.lim = c(0.2,
1), addgrid.col = NA, cl.length = 9, tl.col = "black", tl.cex = 0.75, tl.srt = 45)
corrplot((1 - (results.mito.1)), method = "circle", type = "lower", is.corr = FALSE,
col = colorRampPalette(rev(brewer.pal(n = 9, name = "RdBu")))(100)[1:100], cl.lim = c(0.3,
1), addgrid.col = NA, cl.length = 8, tl.col = "black", tl.cex = 0.75, tl.srt = 45)
corrplot(results.mito.1, method = "circle", type = "upper", is.corr = FALSE, col = colorRampPalette(c("white",
"white", "blue"))(100)[10:100], cl.lim = c(0, 0.8), addgrid.col = "#BEBEBE40",
cl.length = 9, tl.col = "black", tl.cex = 0.75, tl.srt = 45)
dev.off()
## null device
## 1
To finish get session info:
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 31 (Workstation Edition)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] corrplot_0.84 RColorBrewer_1.1-2 reshape2_1.4.3 tidyr_1.0.2
## [5] pheatmap_1.0.12 ggplot2_3.2.1 edgeR_3.28.0 limma_3.42.2
## [9] rmarkdown_2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 plyr_1.8.5 pillar_1.4.3 compiler_3.6.2
## [5] formatR_1.7 tools_3.6.2 statmod_1.4.33 digest_0.6.24
## [9] evaluate_0.14 lifecycle_0.1.0 tibble_2.1.3 gtable_0.3.0
## [13] lattice_0.20-38 pkgconfig_2.0.3 rlang_0.4.4 yaml_2.2.1
## [17] xfun_0.12 withr_2.1.2 stringr_1.4.0 dplyr_0.8.4
## [21] knitr_1.28 vctrs_0.2.2 locfit_1.5-9.1 grid_3.6.2
## [25] tidyselect_1.0.0 glue_1.3.1 R6_2.4.1 farver_2.0.3
## [29] purrr_0.3.3 magrittr_1.5 ellipsis_0.3.0 splines_3.6.2
## [33] scales_1.1.0 htmltools_0.4.0 assertthat_0.2.1 colorspace_1.4-1
## [37] labeling_0.3 stringi_1.4.5 lazyeval_0.2.2 munsell_0.5.0
## [41] crayon_1.3.4